package org.hscoder.websearcher.dytt;

import org.apache.commons.io.FileUtils;
import org.apache.commons.lang.StringUtils;
import org.hscoder.websearcher.util.JsonUtil;
import org.hscoder.websearcher.util.ThreadUtil;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.File;
import java.io.IOException;
import java.net.URL;
import java.util.*;
import java.util.regex.Pattern;
import java.util.stream.Collectors;

/**
 * Crawls the dytt (ygdy8.net) "latest movies" listing pages, extracts the
 * detail-page links from each, and saves them to a JSON file grouped by
 * listing-page number.
 */
public class DyttLinksFetcher {

    private static final Logger logger = LoggerFactory.getLogger(DyttLinksFetcher.class);

    /** URL template of a listing page; %d is the 1-based page number. */
    public static final String page_url_template = "http://www.ygdy8.net/html/gndy/dyzz/list_23_%d.html";

    /**
     * Regex for a relative detail-page path, e.g. "/html/gndy/dyzz/20180929/57457.html".
     * NOTE: the previous value ("/html/gndy/dyzz/\\d+{6}/\\d+{1,20}\\.html") stacked a
     * counted quantifier directly onto "+", which java.util.regex does not accept as a
     * valid quantifier sequence — so checkDetailUrl() never matched any real link and
     * the crawl silently produced empty results (the failure was swallowed by the
     * broad catch in fetchPageLinks()).
     */
    public static final String detail_url_format = "/html/gndy/dyzz/\\d+/\\d+\\.html";

    /** Site root; prepended to the relative detail paths to form absolute URLs. */
    public static final String mainSite = "http://www.ygdy8.net";

    /** Compiled once — String.matches(...) would recompile the regex for every link. */
    private static final Pattern DETAIL_URL_PATTERN = Pattern.compile(detail_url_format);

    /**
     * Validates a candidate detail-page link and makes it absolute.
     *
     * @param link href value taken from a listing page (may be null or empty)
     * @return the absolute detail URL, or null when the link is not a detail page
     */
    private static String checkDetailUrl(String link) {
        if (StringUtils.isEmpty(link) || !DETAIL_URL_PATTERN.matcher(link).matches()) {
            return null;
        }
        return mainSite + link;
    }

    /**
     * Fetches the detail-page links found on a single listing page.
     *
     * @param pageUrl absolute URL of the listing page
     * @return absolute detail URLs in page order, de-duplicated; empty on any
     *         fetch/parse failure (logged, never thrown — a bad page must not
     *         abort the whole crawl)
     */
    private static List<String> fetchPageLinks(String pageUrl) {
        try {
            // LinkedHashSet: drop duplicates while keeping page order.
            Set<String> links = new LinkedHashSet<>();
            Document doc = Jsoup.parse(new URL(pageUrl), 15000);

            // Each listing entry carries its detail link in an ".ulink" anchor.
            for (Element linkNode : doc.select(".co_content8 ul table .ulink")) {
                String url = checkDetailUrl(linkNode.attr("href"));
                if (!StringUtils.isEmpty(url)) {
                    links.add(url);
                }
            }

            return new ArrayList<>(links);

        } catch (Exception e) {
            logger.error("fetch links failed at {}", pageUrl, e);
            return Collections.emptyList();
        }
    }

    /**
     * Crawls the first {@code pageCount} listing pages.
     *
     * @param pageCount number of listing pages to visit (pages 1..pageCount)
     * @return listing-page number (as a string) mapped to the detail URLs found
     *         on that page, in crawl order
     */
    private static Map<String, List<String>> fetchLinks(int pageCount) {

        Map<String, List<String>> results = new LinkedHashMap<>();
        for (int page = 1; page <= pageCount; page++) {
            String pageUrl = String.format(page_url_template, page);

            List<String> urls = fetchPageLinks(pageUrl);
            logger.info("fetch {} links from {}", urls.size(), pageUrl);

            results.put(String.valueOf(page), urls);
            // Throttle between pages so we do not hammer the site.
            ThreadUtil.waitSome(200);
        }

        logger.info("finish.");
        return results;
    }

    /**
     * Entry point: crawls the listing pages and writes the result as pretty JSON.
     *
     * @param args optional overrides — args[0]: output directory (default
     *             "D:\temp\dytt"), args[1]: listing-page count (default 100)
     * @throws IOException if the result file cannot be written
     */
    public static void main(String[] args) throws IOException {

        File dir = new File(args.length > 0 ? args[0] : "D:\\temp\\dytt");
        int pageCount = args.length > 1 ? Integer.parseInt(args[1]) : 100;

        // Crawl all requested listing pages.
        Map<String, List<String>> results = fetchLinks(pageCount);

        // Serialize and persist.
        String content = JsonUtil.toPrettyJson(results);

        if (!dir.exists()) {
            dir.mkdirs();
        }

        File linksFile = new File(dir, "dytt.links");
        FileUtils.write(linksFile, content, "UTF-8");
        logger.info("save {} page to file {}", results.size(), linksFile.getAbsolutePath());
    }
}
